library(pacman)
pacman::p_load(
tidyquant,
readr,
ggplot2,
dplyr,
knitr,
ggthemes
)
# data available at
# https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
data <- read.csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv", na.strings = "", fileEncoding = "UTF-8-BOM")
Another useful way to get data and daily updates is with the covid library which can be installed as follows:
devtools::install_github("RamiKrispin/coronavirus")
We can then include it and get updates with
library(coronavirus)
update_dataset()
## No updates are available
To use the dataset we can do the following:
data("coronavirus")
And then take a look and the data we have available
glimpse(coronavirus)
## Rows: 135,020
## Columns: 7
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26,…
## $ province <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ lat <dbl> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, …
## $ long <dbl> 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, …
## $ type <chr> "confirmed", "confirmed", "confirmed", "confirmed", "confirm…
## $ cases <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
Now let’s try a simple plot of UK confirmed case count
uk <- coronavirus %>%
filter(country == "United Kingdom") %>%
filter(type == "confirmed" & cases > 0 & province == "") %>%
select(-lat, -long, -country) %>%
arrange(date)
uk
ggplot(uk, aes(date, cases)) +
geom_line() +
geom_point(alpha=0.2) +
geom_smooth(se=FALSE) +
labs(
title = "Confirmed UK Coronavirus Cases",
x = "Confirmed Cases",
y = "Date"
) +
theme_tufte()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Some simple examples
Top 20 countries
library(dplyr)
summary_df <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(country) %>%
summarise(total_cases = sum(cases)) %>%
arrange(-total_cases)
## `summarise()` ungrouping output (override with `.groups` argument)
summary_df
New cases in past 24 hours
library(tidyr)
coronavirus %>%
filter(date == max(date)) %>%
select(country, type, cases) %>%
group_by(country, type) %>%
summarise(total_cases = sum(cases)) %>%
pivot_wider(names_from = type,
values_from = total_cases) %>%
arrange(-confirmed)
## `summarise()` regrouping output by 'country' (override with `.groups` argument)
Plotting cases
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
coronavirus %>%
group_by(type, date) %>%
summarise(total_cases = sum(cases)) %>%
pivot_wider(names_from = type, values_from = total_cases) %>%
arrange(date) %>%
mutate(active = confirmed - death - recovered) %>%
mutate(active_total = cumsum(active),
recovered_total = cumsum(recovered),
death_total = cumsum(death)) %>%
plot_ly(x = ~ date,
y = ~ active_total,
name = 'Active',
fillcolor = '#1f77b4',
type = 'scatter',
mode = 'none',
stackgroup = 'one') %>%
add_trace(y = ~ death_total,
name = "Death",
fillcolor = '#E41317') %>%
add_trace(y = ~recovered_total,
name = 'Recovered',
fillcolor = 'forestgreen') %>%
layout(title = "Distribution of Covid19 Cases Worldwide",
legend = list(x = 0.1, y = 0.9),
yaxis = list(title = "Number of Cases"),
xaxis = list(title = "Source: Johns Hopkins University Center for Systems Science and Engineering"))
## `summarise()` regrouping output by 'type' (override with `.groups` argument)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Teemap plot
conf_df <- coronavirus %>%
filter(type == "confirmed") %>%
group_by(country) %>%
summarise(total_cases = sum(cases)) %>%
arrange(-total_cases) %>%
mutate(parents = "Confirmed") %>%
ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
plot_ly(data = conf_df,
type= "treemap",
values = ~total_cases,
labels= ~ country,
parents= ~parents,
domain = list(column=0),
name = "Confirmed",
textinfo="label+value+percent parent")
Let’s create a date rather than having day, month, year
data$Date <- as.Date(paste(data$year, data$month, data$day, sep="-"), "%Y-%m-%d")
glimpse(data)
## Rows: 29,295
## Columns: 13
## $ dateRep <fct> 12/07/2020…
## $ day <int> 12, 11, 10…
## $ month <int> 7, 7, 7, 7…
## $ year <int> 2020, 2020…
## $ cases <int> 85, 458, 2…
## $ deaths <int> 16, 37, 20…
## $ countriesAndTerritories <fct> Afghanista…
## $ geoId <fct> AF, AF, AF…
## $ countryterritoryCode <fct> AFG, AFG, …
## $ popData2019 <int> 38041757, …
## $ continentExp <fct> Asia, Asia…
## $ Cumulative_number_for_14_days_of_COVID.19_cases_per_100000 <dbl> 10.081028,…
## $ Date <date> 2020-07-1…
Let’s create a date and calculate the cumulative total for each day
daily_global_counts <- data %>%
group_by(Date) %>%
summarise(
total_cases = sum(cases),
total_deaths = sum(deaths)
)
## `summarise()` ungrouping output (override with `.groups` argument)
daily_global_counts
We do a quick sanity check by plotting this data on a line graph (as it is time based)
ggplot(daily_global_counts, aes(Date, total_cases)) +
geom_point(alpha=0.4) +
geom_line() +
geom_smooth(se=FALSE, alpha=0.5, linetype=3) +
labs(
title = "Global COVID-19 Cases\nper day"
) +
ylab('New Cases') + xlab('Date') +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
And the same for deaths
ggplot(daily_global_counts, aes(Date, total_deaths)) +
geom_point(alpha = 0.2) +
geom_line() +
geom_smooth(se=FALSE) +
labs(
title = "Cumulative COVID-19 Cases\nper day",
ylab = "Deaths",
xlab = "Date"
) +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The table above shows the cumulative confirmed cases of COVID-19 worldwide by date. Just reading numbers in a table makes it hard to get a sense of the scale and growth of the outbreak. Let’s draw a line plot to visualize the confirmed cases worldwide.
daily_global_counts
We do the cumulative counts
daily_global_counts$cum_cases = cumsum(daily_global_counts$total_cases)
daily_global_counts$cum_deaths = cumsum(daily_global_counts$total_deaths)
And plot
cum_cases_plot <- ggplot(daily_global_counts, aes(Date, cum_cases)) +
geom_point() +
geom_line() +
labs(xlab = "Date", ylab="Cumulative confirmed cases", title="Cumulative COVID-19 confirmed cases over time")
cum_cases_plot
And the log10 version on the y axis
cum_cases_plot +
scale_y_log10() +
labs(
title = "Cumulative cases (log10)",
caption = "Source: "
)
Let’s now group by countries and look at individual national trends across the world
by_country <- data %>%
group_by(countriesAndTerritories) %>%
summarise(
total_cases = sum(cases),
total_deaths = sum(deaths)
)
## `summarise()` ungrouping output (override with `.groups` argument)
by_country
This is useful but it would also be useful to have some cumulative case and death counts for every country so that we can filter countries we wish to examine
top_cases_by_country <- by_country %>%
filter(total_cases > 100000) %>%
arrange(desc(total_cases))
top_cases_by_country
We can show this on a vertical barchart
library(RColorBrewer)
palette <- brewer.pal(5, "RdYlBu")[-(2:4)]
global_mean <- median(top_cases_by_country$total_cases)
x_start <- global_mean + 20000
y_start <- 5.5
x_end <- global_mean
y_end <- 7.5
ggplot(top_cases_by_country, aes(total_cases, countriesAndTerritories, color=total_cases)) +
geom_point(size = 6) +
geom_segment(aes(xend=100000, yend=countriesAndTerritories), size=1) +
geom_text(aes(label = total_cases), color="white", size=1.5) +
scale_x_continuous("", expand=c(0,0), limits=c(50000,2000000), position="top") +
scale_color_gradientn(colors = palette) +
labs(
title = 'Top Countries by Confirmed Cases',
captions = 'Source: opendata.org'
) +
theme_minimal() +
theme(
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title = element_blank(),
axis.text = element_text(color = "black"),
legend.position = "none"
) +
geom_vline(xintercept = global_mean, color="grey40", linetype=3) +
annotate(
"text",
x = x_start,
y = y_start,
label="The\nglobal\naverage",
vjust = 1, size = 3, color="grey40"
) +
annotate(
"curve",
x = x_start, y=y_start,
xend = x_end, yend = y_end,
arrow = arrow(
length = unit(0.2, "cm"),
type = "closed"
),
color="grey40"
)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_text).
Let’s make a faceted view for each country of the cumulative cases Displaying multiple countries as a facet
by_country <- data %>%
group_by(countriesAndTerritories, Date) %>%
filter(countriesAndTerritories %in% c('China', 'Italy', 'Sweden', 'France', 'Iran', 'Brazil', 'India')) %>%
summarise(
total_cases = sum(cases),
total_deaths = sum(deaths)
) %>%
mutate(
cum_cases = cumsum(total_cases),
cum_deaths = cumsum(total_deaths)
)
## `summarise()` regrouping output by 'countriesAndTerritories' (override with `.groups` argument)
by_country
ggplot(by_country) +
geom_line(aes(Date, total_cases), color='blue') +
facet_wrap(~countriesAndTerritories, ncol=2, scale="free_y") +
geom_col(aes(Date, total_deaths), color='red') +
labs(
title="Cases and Deaths for COVID-19",
subtitle="Shown for select countries with stark\ndifference between reported case/death ratios",
ylab = "Cases & Deaths"
) +
theme_clean()
It is clear that in some countries there under-reporting of deaths. Let’s separate out some key suspects
under_reporting <- by_country %>%
filter(countriesAndTerritories %in% c('China', 'Brazil', 'Iran'))
ggplot(under_reporting) +
geom_line(aes(Date, total_cases), color="steelblue") +
facet_wrap(~countriesAndTerritories) +
geom_col(aes(Date, total_deaths), color='red') +
labs(
title="Countries potentially underreporting case/death ratio"
) +
theme_tq()
Compared to more typical countries
ord_reporting <- by_country %>%
filter(countriesAndTerritories %in% c('France', 'Italy', 'United_Kingdom'))
ggplot(ord_reporting) +
geom_line(aes(Date, total_cases), color="steelblue") +
facet_wrap(~countriesAndTerritories) +
geom_col(aes(Date, total_deaths), color='red') +
labs(
title="Countries reporting typical case/death ratios"
) +
theme_tq()
Prepare the data as usual by getting
uk_data <- data %>%
filter(countriesAndTerritories == 'United_Kingdom') %>%
group_by(Date) %>%
arrange(Date) %>%
summarise(
total_cases = sum(cases),
total_deaths = sum(deaths)
) %>%
mutate(
cum_cases = cumsum(total_cases),
cum_deaths = cumsum(total_deaths)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# summarize(
# cum_sizes = cumsum(cases),
# cum_deaths = cumsum(deaths)
# )
uk_data
Now let’s plot the total daily cases and the cumulative cases:
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
plot1 <- ggplot(uk_data, aes(Date, total_cases)) +
geom_line() +
labs(
title = 'UK Cases per day',
subtitle = 'Jan, 2020 to June 2020',
caption = 'Source: Open Data',
ylab = 'Total cases'
) +
ylim(0, 7500)
plot2 <- ggplot(uk_data, aes(Date, total_deaths)) +
geom_line() +
labs(
title = 'UK Deaths per day',
subtitle = 'Jan, 2020 to June 2020',
caption = 'Source: Open Data',
ylab = 'Total Deaths'
) +
ylim(0, 7500)
grid.arrange(plot1, plot2, nrow=1)
library(tidyquant)
library(zoo)
uk_roll_mean <- uk_data %>%
tq_mutate(
select = total_cases,
mutate_fun = rollapply,
width = 30,
align = 'right',
FUN = mean,
na.rm = TRUE,
col_rename = "mean_30"
) %>%
tq_mutate(
select = total_cases,
mutate_fun = rollapply,
width = 90,
align = "right",
FUN = mean,
na.rm = TRUE,
col_rename = "mean_90"
)
uk_roll_mean %>%
ggplot(aes(Date, total_cases)) +
# geom_point(alpha=0.2) +
geom_line(aes(Date, mean_30), color= palette_light()[[1]], size = 1, linetype = 1) +
geom_line(aes(Date, mean_90), color= palette_light()[[2]], size = 1, linetype = 1) +
labs(
title = "UK Cases per day",
subtitle = "Showing 30 and 90 days moving averages"
) +
scale_color_tq() +
theme(legend.position = "none")
## Warning: Removed 29 row(s) containing missing values (geom_path).
## Warning: Removed 89 row(s) containing missing values (geom_path).
# Custom function to return mean, sd, 95% conf interval
custom_rolling_stats <- function(x, na.rm = TRUE) {
# x = numeric vector
# na.rm = boolean, whether or not to remove NA's
m <- mean(x, na.rm = na.rm)
s <- sd(x, na.rm = na.rm)
hi <- m + 2*s
lo <- m - 2*s
ret <- c(mean = m, stdev = s, hi.95 = hi, lo.95 = lo)
return(ret)
}
Now we can use this function on our UK cases data
uk_data_rollstats <- uk_data %>%
tq_mutate(
select = total_cases,
mutate_fun = rollapply,
width = 30,
align = "right",
by.column = FALSE,
FUN = custom_rolling_stats,
na.rm = TRUE
)
uk_data_rollstats
We now have the data to view
Let’s plot this in a Bollinger Bands style
uk_data_rollstats %>%
ggplot(aes(Date)) +
geom_point(aes(y=total_cases), color="grey40", alpha=0.5) +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
geom_line(aes(y = mean), linetype = 2, size = 1, alpha = 0.7, color="red") +
labs(
title = "UK Cases by day", x = "",
subtitle = "30-Day Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)"
) +
scale_color_tq() +
theme_tq() +
theme(
legend.position = "None"
)
## Warning: Removed 29 row(s) containing missing values (geom_path).
Now let’s try this and play with a variable for the width of the rolling average called roll_width() and also perform the analysis on death data
rollwidth <- 7
# cases plot
plot_1 <- uk_data %>%
tq_mutate(
select = total_cases,
mutate_fun = rollapply,
width = rollwidth,
align = "left",
by.column = FALSE,
FUN = custom_rolling_stats,
na.rm = TRUE
) %>%
ggplot(aes(x = Date)) +
geom_point(aes(y=total_cases), color="grey40", alpha=0.5, position = 'stack') +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
geom_line(aes(y = mean), linetype = 1, size = 1, alpha = 0.8, color="steelblue") +
labs(
title = "UK Cases by day", x = "",
subtitle = "Dynamic Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)",
caption = "Open Data",
ylab = "Total Cases"
) +
scale_color_tq() +
theme_tq() +
theme(
legend.position = "None"
)
# deaths plot
plot_2 <- uk_data %>%
tq_mutate(
select = total_deaths,
mutate_fun = rollapply,
width = rollwidth,
align = "left",
by.column = FALSE,
FUN = custom_rolling_stats,
na.rm = TRUE
) %>%
ggplot(aes(x = Date)) +
geom_point(aes(y=total_deaths), color="grey40", alpha=0.5, position = 'stack') +
geom_ribbon(aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
geom_line(aes(y = mean), linetype = 1, size = 1, alpha = 0.8, color="steelblue") +
labs(
title = "UK Deaths by day", x = "",
subtitle = "Dynamic Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)",
caption = "Open Data",
ylab = "Total Cases"
) +
scale_color_tq() +
theme_tq() +
theme(
legend.position = "None"
)
grid.arrange(plot_1, plot_2, nrow=1)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
Now let’s try to normalise the x-axis and plot both on the same axis
cases_roll_data <- uk_data %>%
tq_mutate(
select = total_cases,
mutate_fun = rollapply,
width = 7,
align = "right",
by.column = FALSE,
FUN = custom_rolling_stats,
na.rm = TRUE
)
deaths_roll_data <- uk_data %>%
tq_mutate(
select = total_deaths,
mutate_fun = rollapply,
width = 7,
align = "right",
by.column = FALSE,
FUN = custom_rolling_stats,
na.rm = TRUE
)
ggplot(NULL, aes(x = Date)) +
geom_point(data=cases_roll_data, aes(y=total_cases), color="grey40", alpha=0.5, position = 'stack') +
geom_ribbon(data=cases_roll_data, aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
geom_line(data=cases_roll_data, aes(y = mean), linetype = 1, size = 1, alpha = 0.8, color="steelblue") +
geom_line(data=deaths_roll_data, aes(y = mean), linetype = 1, size = 0.5, alpha = 0.5, color="red") +
geom_ribbon(data=deaths_roll_data, aes(ymin = lo.95, ymax = hi.95), alpha=0.4) +
labs(
title = "UK Cases & Deaths per day", x = "Date", y = "Cases & Deaths",
subtitle = "Dynamic Moving Average with\n 95% Confidence Interval Bands (+/-2 Standard Deviations)",
caption = "Open Data",
ylab = "Total Cases"
) +
scale_color_tq() +
theme_tq() +
theme(
legend.position = "None"
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).